library(dplyr)
library(magrittr)
library(tidyr)
library(ggplot2)

# Download and process raw ngram data from Google.
#
# Every shell step is checked: a failed download, extraction or filter stops
# the script here instead of surfacing later as a confusing parse error.

# The target directory must exist before anything is written into it.
dir.create("./data", showWarnings = FALSE, recursive = TRUE)

# Run a shell command and stop with a clear message on a non-zero exit status.
#
# command: a single string handed to system().
# Returns the (zero) exit status invisibly.
run_shell <- function(command) {
  status <- system(command)
  if (status != 0) {
    stop("Shell command failed (exit status ", status, "): ", command,
         call. = FALSE)
  }
  invisible(status)
}

ngram_url <- "http://storage.googleapis.com/books/ngrams/books/20200217/eng/"

download.file(paste0(ngram_url, "totalcounts-1"), "./data/totalcounts-1")

# I find curl is most reliable on Linux for large files.
# Windows users may prefer R's internal download.file, e.g.
# download.file(paste0(ngram_url, shard, ".gz"),
#               paste0("./data/", shard, ".gz"))
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error page as if it were the archive.
for (shard in c("1-00017-of-00024", "1-00022-of-00024")) {
  run_shell(sprintf("curl --fail %s%s.gz -o ./data/%s.gz",
                    ngram_url, shard, shard))
  run_shell(sprintf("gunzip -f ./data/%s.gz", shard))
}

# Keep only the lines whose ngram is the target word (case-insensitive).
# grep exits with status 1 when nothing matches, which is treated as an error.
run_shell("grep -i -P '^democracy\t' ./data/1-00017-of-00024 > ./data/democracy.txt")
run_shell("grep -i -P '^republic\t' ./data/1-00022-of-00024 > ./data/republic.txt")

# Turn the tab-separated totals into one record per line.
run_shell("sed -i 's/\\t/\\n/g' ./data/totalcounts-1") # note double escape on regex from within R

# Load totals: one comma-separated record per line, read without a header row
# and given the column names below.
totals <- data.table::fread("./data/totalcounts-1",
                            sep = ",", header = FALSE,
                            col.names = c("year", "match_count",
                                          "page_count", "volume_count"))

# Every year between the first and last year in the totals, so later joins
# can fill in years where a word has no counts.
years_df <- data.frame(year = min(totals$year):max(totals$year))

# Parse raw ngram lines into one row per (ngram, year) entry.
#
# Each line is split on tabs: the first field is taken as the ngram and every
# remaining field is split on commas into year, match_count, volume_count.
#
# lines: character vector, one ngram record per element.
# Returns a data.frame with columns ngram (character) and year, match_count,
# volume_count (numeric). Empty input, blank lines, and lines without any
# year entries contribute zero rows instead of spurious NA rows.
parse_ngram_lines <- function(lines) {
  lines <- lines[!is.na(lines) & nzchar(lines)]
  fields <- strsplit(lines, "\t", fixed = TRUE)
  entries <- lapply(fields, function(parts) parts[-1L])
  # as.character() keeps strsplit() happy when there are no entries at all.
  triples <- strsplit(as.character(unlist(entries)), ",", fixed = TRUE)
  pick <- function(k) {
    as.numeric(vapply(triples, function(tr) tr[k], character(1)))
  }
  data.frame(
    ngram = rep(vapply(fields, function(parts) parts[1L], character(1)),
                times = lengths(entries)),
    year = pick(1L),
    match_count = pick(2L),
    volume_count = pick(3L)
  )
}

# Summarise parsed ngram counts into yearly shares and rolling means.
#
# counts:       data.frame as returned by parse_ngram_lines().
# totals:       yearly totals with columns year, match_count, volume_count.
# years_df:     data.frame with a year column listing every year to keep.
# series_label: value stored in the series column of the result.
#
# Returns one row per year in years_df from 1600 onwards. Years without
# counts get a share of 0, and the rolling means are computed on the full
# year range before the 1600 cut-off is applied.
summarise_ngram_series <- function(counts, totals, years_df, series_label) {
  na_to_zero <- function(x) ifelse(is.na(x), 0, x)
  counts %>%
    group_by(year) %>%
    summarize(n = sum(match_count),
              vols = sum(volume_count)) %>%
    ungroup() %>%
    left_join(totals, by = "year") %>%
    right_join(years_df, by = "year") %>%
    arrange(year) %>%
    mutate(n_pct = na_to_zero(n / match_count),
           vol_pct = na_to_zero(vols / volume_count),
           # fill = NA replaces the deprecated na.pad = TRUE.
           n_pct_mm = zoo::rollmean(n_pct, k = 10, fill = NA),
           n_pct_mm_30 = zoo::rollmean(n_pct, k = 30, fill = NA),
           n_pct_mm_50 = zoo::rollmean(n_pct, k = 50, fill = NA),
           vol_pct_mm = zoo::rollmean(vol_pct, k = 10, fill = NA),
           series = series_label) %>%
    filter(year >= 1600)
}

# Load democracy
dem_raw <- readLines("./data/democracy.txt")
dem_df <- parse_ngram_lines(dem_raw)
dem_sm <- summarise_ngram_series(dem_df, totals, years_df, "Democracy")

# Load republic
rep_raw <- readLines("./data/republic.txt")
rep_df <- parse_ngram_lines(rep_raw)
rep_sm <- summarise_ngram_series(rep_df, totals, years_df, "Republic")


# Join the two
dat <- bind_rows(dem_sm, rep_sm)

# Plot the results: the n_pct_mm_50 rolling mean per year, one line type per
# series. The legend is hidden and each line is labelled directly instead.
p50 <- ggplot(dat, aes(x = year, y = n_pct_mm_50, group = series)) +
  geom_line(aes(linetype = series)) +
  annotate("text", x = 2000, y = 2.55e-5, label = "Democracy") +
  annotate("text", x = 2000, y = 7.4e-6, label = "Republic") +
  theme(panel.background = element_blank(),
        axis.line = element_line(color = "black"),
        legend.key.size = unit(1, "cm"),
        legend.key = element_rect(fill = "white"),
        legend.position = "none") +
  labs(x = "Year", y = "", linetype = "ngram")
p50

# The output directory must exist before ggsave() can write into it.
dir.create("./output", showWarnings = FALSE, recursive = TRUE)

# Arguments are spelled out in full; `file =` only worked through partial
# matching of ggsave()'s `filename` argument.
# ggsave(filename = "figure_2_1.png", plot = p50, dpi = 150, height = 8.5, width = 11)
ggsave(filename = "./output/figure_2_1.tiff", plot = p50,
       dpi = 300, height = 8.5, width = 11)

